"""
PDF Storage Manager for Research Library

Handles PDF storage across three modes:
- none: Don't store PDFs (text-only)
- filesystem: Store PDFs unencrypted on disk (fast, external tool compatible)
- database: Store PDFs encrypted in SQLCipher database (secure, portable)
"""

import hashlib
import re
from datetime import datetime, UTC
from pathlib import Path
from typing import Optional, Tuple
from urllib.parse import urlparse

from loguru import logger
from sqlalchemy.orm import Session

from ...database.models.library import Document, DocumentBlob
from ...security.path_validator import PathValidator


class PDFStorageManager:
    """Unified interface for PDF storage across all modes."""

    def __init__(
        self, library_root: Path, storage_mode: str, max_pdf_size_mb: int = 100
    ):
        """
        Initialize PDF storage manager.

        Args:
            library_root: Base directory for filesystem storage
            storage_mode: One of 'none', 'filesystem', 'database'
            max_pdf_size_mb: Maximum PDF file size in MB (default 100)
        """
        self.library_root = Path(library_root)
        self.storage_mode = storage_mode
        self.max_pdf_size_bytes = max_pdf_size_mb * 1024 * 1024

        if storage_mode not in ("none", "filesystem", "database"):
            logger.warning(
                f"Unknown storage mode '{storage_mode}', defaulting to 'none'"
            )
            self.storage_mode = "none"

    def save_pdf(
        self,
        pdf_content: bytes,
        document: Document,
        session: Session,
        filename: str,
        url: Optional[str] = None,
        resource_id: Optional[int] = None,
    ) -> Tuple[Optional[str], int]:
        """
        Save PDF based on configured storage mode.

        Args:
            pdf_content: Raw PDF bytes
            document: Document model instance
            session: Database session
            filename: Filename to use for saving
            url: Source URL (for generating better filenames)
            resource_id: Resource ID (for generating better filenames)

        Returns:
            Tuple of (file_path or storage indicator, file_size)
            - For filesystem: relative path string
            - For database: "database"
            - For none: None
        """
        file_size = len(pdf_content)

        # Check file size limit
        if file_size > self.max_pdf_size_bytes:
            max_mb = self.max_pdf_size_bytes / (1024 * 1024)
            logger.warning(
                f"PDF size ({file_size / (1024 * 1024):.1f}MB) exceeds limit "
                f"({max_mb:.0f}MB), skipping storage"
            )
            return None, file_size

        if self.storage_mode == "none":
            logger.debug("PDF storage mode is 'none' - skipping PDF save")
            return None, file_size

        elif self.storage_mode == "filesystem":
            file_path = self._save_to_filesystem(
                pdf_content, filename, url, resource_id
            )
            relative_path = str(file_path.relative_to(self.library_root))
            document.storage_mode = "filesystem"
            document.file_path = relative_path
            logger.info(f"PDF saved to filesystem: {relative_path}")
            return relative_path, file_size

        elif self.storage_mode == "database":
            self._save_to_database(pdf_content, document, session)
            document.storage_mode = "database"
            document.file_path = None  # No filesystem path
            logger.info(f"PDF saved to database for document {document.id}")
            return "database", file_size

        return None, file_size

    def load_pdf(self, document: Document, session: Session) -> Optional[bytes]:
        """
        Load PDF - check database first, then filesystem.

        Smart retrieval: doesn't rely on storage_mode column, actually checks
        where the PDF exists.

        Args:
            document: Document model instance
            session: Database session

        Returns:
            PDF bytes or None if not available
        """
        # 1. Check database first
        pdf_bytes = self._load_from_database(document, session)
        if pdf_bytes:
            logger.debug(f"Loaded PDF from database for document {document.id}")
            return pdf_bytes

        # 2. Fallback to filesystem
        pdf_bytes = self._load_from_filesystem(document)
        if pdf_bytes:
            logger.debug(
                f"Loaded PDF from filesystem for document {document.id}"
            )
            return pdf_bytes

        logger.debug(f"No PDF available for document {document.id}")
        return None

    def has_pdf(self, document: Document, session: Session) -> bool:
        """
        Check if PDF is available without loading the actual bytes.

        Args:
            document: Document model instance
            session: Database session

        Returns:
            True if PDF is available (in database or filesystem)
        """
        # Must be a PDF file type
        if document.file_type != "pdf":
            return False

        # Check database first (has blob?)
        from ...database.models.library import DocumentBlob

        has_blob = (
            session.query(DocumentBlob.id)
            .filter_by(document_id=document.id)
            .first()
            is not None
        )
        if has_blob:
            return True

        # Check filesystem
        if document.file_path and document.file_path not in (
            "metadata_only",
            "text_only_not_stored",
        ):
            file_path = self.library_root / document.file_path
            if file_path.exists():
                return True

        return False

    def _infer_storage_mode(self, document: Document) -> str:
        """
        Infer storage mode for documents without explicit mode set.
        Used for backward compatibility with existing documents.
        """
        # If there's a blob, it's database storage
        if hasattr(document, "blob") and document.blob:
            return "database"
        # If there's a file_path (and not 'metadata_only'), it's filesystem
        if document.file_path and document.file_path != "metadata_only":
            return "filesystem"
        # Otherwise no storage
        return "none"

    def _save_to_filesystem(
        self,
        pdf_content: bytes,
        filename: str,
        url: Optional[str] = None,
        resource_id: Optional[int] = None,
    ) -> Path:
        """
        Save PDF to filesystem with organized structure.

        Returns:
            Absolute path to saved file
        """
        # Generate better filename if URL is provided
        if url:
            filename = self._generate_filename(url, resource_id, filename)

        # Create simple flat directory structure - all PDFs in one folder
        pdf_path = self.library_root / "pdfs"
        pdf_path.mkdir(parents=True, exist_ok=True)

        # Use PathValidator with relative path from library_root
        relative_path = f"pdfs/{filename}"
        validated_path = PathValidator.validate_safe_path(
            relative_path,
            base_dir=str(self.library_root),
            required_extensions=(".pdf",),
        )

        if not validated_path:
            raise ValueError("Invalid file path")

        # Write the PDF file with security verification
        from ...security.file_write_verifier import write_file_verified

        write_file_verified(
            validated_path,
            pdf_content,
            "research_library.pdf_storage_mode",
            "filesystem",
            "library PDF storage",
            mode="wb",
        )

        return Path(validated_path)

    def _save_to_database(
        self, pdf_content: bytes, document: Document, session: Session
    ) -> None:
        """Store PDF in document_blobs table."""
        # Check if blob already exists
        existing_blob = (
            session.query(DocumentBlob)
            .filter_by(document_id=document.id)
            .first()
        )

        if existing_blob:
            # Update existing blob
            existing_blob.pdf_binary = pdf_content
            existing_blob.blob_hash = hashlib.sha256(pdf_content).hexdigest()
            existing_blob.stored_at = datetime.now(UTC)
            logger.debug(f"Updated existing blob for document {document.id}")
        else:
            # Create new blob
            blob = DocumentBlob(
                document_id=document.id,
                pdf_binary=pdf_content,
                blob_hash=hashlib.sha256(pdf_content).hexdigest(),
                stored_at=datetime.now(UTC),
            )
            session.add(blob)
            logger.debug(f"Created new blob for document {document.id}")

    def _load_from_filesystem(self, document: Document) -> Optional[bytes]:
        """Load PDF from filesystem."""
        if not document.file_path or document.file_path == "metadata_only":
            return None

        file_path = self.library_root / document.file_path

        if not file_path.exists():
            logger.warning(f"PDF file not found: {file_path}")
            return None

        try:
            return file_path.read_bytes()
        except Exception:
            logger.exception(f"Failed to read PDF from {file_path}")
            return None

    def _load_from_database(
        self, document: Document, session: Session
    ) -> Optional[bytes]:
        """Load PDF from document_blobs table."""
        blob = (
            session.query(DocumentBlob)
            .filter_by(document_id=document.id)
            .first()
        )

        if not blob:
            logger.debug(f"No blob found for document {document.id}")
            return None

        # Update last accessed timestamp
        blob.last_accessed = datetime.now(UTC)

        return blob.pdf_binary

    def _generate_filename(
        self, url: str, resource_id: Optional[int], fallback_filename: str
    ) -> str:
        """Generate a meaningful filename from URL."""
        parsed_url = urlparse(url)
        hostname = parsed_url.hostname or ""
        timestamp = datetime.now(UTC).strftime("%Y%m%d")

        if hostname == "arxiv.org" or hostname.endswith(".arxiv.org"):
            # Extract arXiv ID
            match = re.search(r"(\d{4}\.\d{4,5})", url)
            if match:
                return f"arxiv_{match.group(1)}.pdf"
            return f"arxiv_{timestamp}_{resource_id or 'unknown'}.pdf"

        elif hostname == "ncbi.nlm.nih.gov" and "/pmc" in parsed_url.path:
            # Extract PMC ID
            match = re.search(r"(PMC\d+)", url)
            if match:
                return f"pmc_{match.group(1)}.pdf"
            return f"pubmed_{timestamp}_{resource_id or 'unknown'}.pdf"

        # Use fallback filename
        return fallback_filename

    def delete_pdf(self, document: Document, session: Session) -> bool:
        """
        Delete PDF for a document.

        Args:
            document: Document model instance
            session: Database session

        Returns:
            True if deletion succeeded
        """
        storage_mode = document.storage_mode or self._infer_storage_mode(
            document
        )

        try:
            if storage_mode == "filesystem":
                if document.file_path and document.file_path != "metadata_only":
                    file_path = self.library_root / document.file_path
                    if file_path.exists():
                        file_path.unlink()
                        logger.info(f"Deleted PDF file: {file_path}")
                document.file_path = None
                document.storage_mode = "none"
                return True

            elif storage_mode == "database":
                blob = (
                    session.query(DocumentBlob)
                    .filter_by(document_id=document.id)
                    .first()
                )
                if blob:
                    session.delete(blob)
                    logger.info(f"Deleted PDF blob for document {document.id}")
                document.storage_mode = "none"
                return True

            return True  # Nothing to delete for 'none' mode

        except Exception:
            logger.exception(f"Failed to delete PDF for document {document.id}")
            return False

    def upgrade_to_pdf(
        self, document: Document, pdf_content: bytes, session: Session
    ) -> bool:
        """
        Upgrade a text-only document to include PDF storage.

        If document already has a PDF stored, returns False (no action needed).
        If document is text-only, adds the PDF blob and updates storage_mode.

        Args:
            document: Document model instance
            pdf_content: Raw PDF bytes
            session: Database session

        Returns:
            True if PDF was added, False if already had PDF or failed
        """
        # Only upgrade if document is currently text-only
        if document.storage_mode not in (None, "none"):
            logger.debug(
                f"Document {document.id} already has storage_mode={document.storage_mode}"
            )
            return False

        # Check if blob already exists (shouldn't happen, but be safe)
        existing_blob = (
            session.query(DocumentBlob)
            .filter_by(document_id=document.id)
            .first()
        )
        if existing_blob:
            logger.debug(f"Document {document.id} already has a blob")
            return False

        # Check file size
        file_size = len(pdf_content)
        if file_size > self.max_pdf_size_bytes:
            max_mb = self.max_pdf_size_bytes / (1024 * 1024)
            logger.warning(
                f"PDF size ({file_size / (1024 * 1024):.1f}MB) exceeds limit "
                f"({max_mb:.0f}MB), skipping upgrade"
            )
            return False

        try:
            # Add the PDF blob
            self._save_to_database(pdf_content, document, session)
            document.storage_mode = "database"
            document.file_path = None
            logger.info(f"Upgraded document {document.id} with PDF blob")
            return True
        except Exception:
            logger.exception(
                f"Failed to upgrade document {document.id} with PDF"
            )
            return False
